# Avoid scientific notation when printing large population counts
options(scipen = 999)
library(tidyverse)
library(censusapi)
library(urbnthemes)
library(reactable)
library(kableExtra)
library(here)
# Urban Institute ggplot2 defaults for print-style figures
set_urbn_defaults(style = "print")
# Census API key (see the Housekeeping note about census_api_key-template.R)
source(here::here("06_neighborhoods", "R", "census_api_key.R"))
source(here::here("06_neighborhoods", "R", "get_vars.R"))Poverty Exposure - City
This metric is the share of people experiencing poverty in a census place who live in census tracts with poverty rates over 40%. If a place’s overall poverty rate is 20% but people in poverty are spread out evenly across all census tracts, the index would equal 0; if they were heavily concentrated in certain tracts, the index would approach 1.
Process
- Pull people and poverty rates for census tracts.
- Create the “Other Races and Ethnicities” subgroup.
- Count the number of people in poverty who live in census tracts with poverty > 40% in each place.
- Crosswalk census tracts to census places
- Summarize the tract data to the place-level.
- Divide the number from step 3 by the total number of people in poverty in each census place.
- Validation
- Data quality flags
- Save the data
Housekeeping
All numbers come from the Census API. The documentation for the Census API is available here. We pull all of the race/ethnicity counts for 2014, 2016, 2018, 2021, and 2023 using library(censusapi). Note: This will require a Census API key. Add the key to census_api_key-template.R and then delete “template”. It is sourced above.
To do this we have to identify census tracts with poverty rates over 40% in each census place, count the number of residents in those tracts who are poor, sum that up, and divide it by the total number of poor residents in the census place.
1. Pull people and poverty rates for census tracts
https://api.census.gov/data/2021/acs/acs5/variables.html
#' Pull ACS 5-year poverty status counts by race/ethnicity
#'
#' Downloads table B17001 ("Poverty Status in the Past 12 Months") estimates
#' and margins of error for every geography of the requested type, querying
#' the API one state at a time for the 50 states plus DC (Puerto Rico is
#' intentionally excluded from the FIPS list).
#'
#' @param year ACS 5-year vintage to request (e.g. 2021)
#' @param geography Census geography string passed to the API, e.g. "tract"
#'   or "place"
#'
#' @return A tibble with one row per geography: the API's geography id
#'   columns, the poverty counts/MOEs renamed per `vars`, and a `year`
#'   column recording the vintage
get_poverty_status_data <- function(year, geography) {
  # Named vector of API variables; the names become the tidy column names in
  # the rename() below. Variable codes:
  # https://api.census.gov/data/2021/acs/acs5/variables.html
  vars <- c( # Estimate!!Total!!Income in the past 12 months below poverty level
    # "B00001_001E", # UNWEIGHTED SAMPLE COUNT OF THE POPULATION
    # "B01001_001E", # SEX BY AGE
    people = "B17001_001E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (Total)
    poverty = "B17001_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE
    poverty_moe = "B17001_002M",
    # "B17001A_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE)
    # "B17001A_002M",
    poverty_black = "B17001B_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)
    poverty_black_moe = "B17001B_002M",
    poverty_aian = "B17001C_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)
    poverty_aian_moe = "B17001C_002M",
    poverty_asian = "B17001D_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (ASIAN ALONE)
    poverty_asian_moe = "B17001D_002M",
    poverty_pacific = "B17001E_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)
    poverty_pacific_moe = "B17001E_002M",
    poverty_other = "B17001F_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (SOME OTHER RACE ALONE)
    poverty_other_moe = "B17001F_002M",
    poverty_twoplus = "B17001G_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (TWO OR MORE RACES)
    poverty_twoplus_moe = "B17001G_002M",
    poverty_white_nonhispanic = "B17001H_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (WHITE ALONE, NOT HISPANIC OR LATINO)
    poverty_white_nonhispanic_moe = "B17001H_002M",
    poverty_hispanic = "B17001I_002E", # POVERTY STATUS IN THE PAST 12 MONTHS BY SEX BY AGE (HISPANIC OR LATINO)
    poverty_hispanic_moe = "B17001I_002M"
  )

  # FIPS codes for the 50 states + DC; "72" (Puerto Rico) is deliberately absent
  state_fips <-
    paste0(
      "state:",
      c(
        "01", "02", "04", "05", "06", "08", "09", "10", "11", "12",
        "13", "15", "16", "17", "18", "19", "20", "21", "22", "23",
        "24", "25", "26", "27", "28", "29", "30", "31", "32", "33",
        "34", "35", "36", "37", "38", "39", "40", "41", "42", "44",
        "45", "46", "47", "48", "49", "50", "51", "53", "54", "55",
        "56"
      )
    )

  # Query the API state-by-state and stack the results. `year` is an ordinary
  # function argument: `{{ }}` is tidy-eval syntax with no special meaning in
  # getCensus(), so it is passed plainly (the original `{{ year }}` only
  # worked because `{ { year } }` happens to evaluate to `year`).
  geographies <- map(state_fips, \(st) getCensus(
    name = "acs/acs5",
    vars = vars,
    region = paste0(geography, ":*"),
    regionin = st,
    vintage = year
  )) |>
    list_rbind() |>
    as_tibble() |>
    # .env$ disambiguates the function argument from any data column
    mutate(year = .env$year)

  # swap the raw B17001* codes for the readable names in `vars`
  geographies |>
    rename(all_of(vars))
}
# ACS 5-year vintages used throughout this analysis
years <- c(2014, 2016, 2018, 2021, 2023)
# Cache the API pull: only hit the Census API when no local copy exists
if (!file.exists(here::here("06_neighborhoods/poverty-exposure/data/tracts.csv"))) {
  tracts_raw <- map(years, ~ get_poverty_status_data(.x, geography = "tract")) |>
    list_rbind() |>
    relocate(year, .before = everything())
  if (!dir.exists(here::here("06_neighborhoods/poverty-exposure/data"))) {
    dir.create(here::here("06_neighborhoods/poverty-exposure/data"))
  }
  write_csv(tracts_raw, here("06_neighborhoods/poverty-exposure/data/tracts.csv"))
}tracts_raw <- read_csv(here::here("06_neighborhoods/poverty-exposure/data/tracts.csv"))Some tracts don’t have any population. We drop those tracts.
# Drop tracts with no population: they contribute no people in poverty and
# would produce undefined (0/0) poverty rates downstream
tracts <- tracts_raw %>%
  tidylog::filter(people > 0)Check the number of people. It should be around 321,897,703 in 2021.
tracts %>%
summarize(sum(people), .by = year)# A tibble: 5 × 2
year `sum(people)`
<dbl> <dbl>
1 2014 306226394
2 2016 310629645
3 2018 314904186
4 2021 321897703
5 2023 324485878
2. Create the “Other Races and Ethnicities” subgroup
We need to combine the small groups into a group for other races and ethnicities. The Census Bureau typically only posts cross tabs for up to two variables. This requires race, ethnicity, and poverty status so the resulting groups are not disjoint.
Combine the race/ethnicity groups into the group of interest.
# Sum the five smaller race/ethnicity poverty counts into a single
# "Other Races and Ethnicities" estimate
tracts <- tracts %>%
  mutate(
    poverty_other_races =
      poverty_aian +
      poverty_asian +
      poverty_pacific +
      poverty_other +
      poverty_twoplus
  )This Census presentation recommends using the maximum margin of error when aggregating multiple zero estimates.
One way this approximation can differ from the actual MOE is if you were aggregating multiple zero estimates. In this case, the approximate MOE could diverge from the actual margin of error. And so the - our recommendation is to only include one zero estimate margin of error and include the largest one.
# pivot the point estimates
# (long format: one row per tract x component group, so each zero estimate
# can be paired with its MOE below)
values <- tracts %>%
  select(
    year,
    state,
    county,
    tract,
    poverty_aian,
    poverty_asian,
    poverty_pacific,
    poverty_other,
    poverty_twoplus
  ) %>%
  pivot_longer(c(-year, -state, -county, -tract), names_to = "group", values_to = "value")
# pivot the margins of error
moes <- tracts %>%
  select(
    year,
    state,
    county,
    tract,
    poverty_aian_moe,
    poverty_asian_moe,
    poverty_pacific_moe,
    poverty_other_moe,
    poverty_twoplus_moe
  ) %>%
  pivot_longer(c(-year, -state, -county, -tract), names_to = "group", values_to = "moe") %>%
  # strip the "_moe" suffix so `group` matches the group names in `values`
  mutate(group = str_replace(group, "_moe", ""))
# Pair each component group's point estimate with its margin of error
other_moe <- values %>%
  left_join(moes, by = c("year", "state", "county", "tract", "group"))

# the long-format intermediates are no longer needed
rm(moes, values)
# keep MOE for non-zero estimates and keep the largest MOE for zero estimates
# (per the Census guidance quoted above: when aggregating several zero
# estimates, include only the single largest zero-estimate MOE)
other_moe <- other_moe %>%
  group_by(year, state, county, tract) %>%
  # rank the five components by MOE, largest first (row_number breaks ties)
  mutate(moe_rank = row_number(desc(moe))) %>%
  # zero estimates keep their size rank; non-zero estimates get sentinel 5
  mutate(moe_rank = if_else(value == 0, moe_rank, 5L)) %>%
  # zero out the rank on every row except the best-ranked one, i.e. the zero
  # estimate with the largest MOE (when nothing is zero, all rows keep 5)
  mutate(moe_rank = ifelse(moe_rank == min(moe_rank), moe_rank, 0L)) %>%
  # retain every non-zero estimate plus that single flagged zero estimate
  filter(value != 0 | moe_rank != 0) %>%
  select(-moe_rank)
# combine the margins of error: square root of the sum of squared MOEs,
# the standard approximation for aggregating ACS estimates
other_moe <- other_moe %>%
  summarize(poverty_other_races_moe = sqrt(sum(moe^2))) %>%
  ungroup()
# append to the original data set
tracts <- left_join(tracts, other_moe, by = c("year", "state", "county", "tract"))We convert margins of error to standard errors using 1.645 as the critical value (page 3)
# ACS margins of error are published at the 90% level, so SE = MOE / 1.645
tracts <- tracts %>%
  mutate(
    poverty_se = poverty_moe / 1.645,
    poverty_black_se = poverty_black_moe / 1.645,
    poverty_hispanic_se = poverty_hispanic_moe / 1.645,
    poverty_other_races_se = poverty_other_races_moe / 1.645,
    poverty_white_nonhispanic_se = poverty_white_nonhispanic_moe / 1.645
  )tracts <- tracts %>%
  # Keep only the columns used downstream.
  # NOTE(review): `year` is listed twice in this select(); select() silently
  # de-duplicates, so this is harmless but could be tidied.
  select(
    year,
    state,
    county,
    tract,
    year,
    people,
    poverty,
    poverty_black,
    poverty_hispanic,
    poverty_other_races,
    poverty_white_nonhispanic,
    poverty_se,
    poverty_black_se,
    poverty_hispanic_se,
    poverty_other_races_se,
    poverty_white_nonhispanic_se,
    poverty_moe,
    poverty_black_moe,
    poverty_hispanic_moe,
    poverty_other_races_moe,
    poverty_white_nonhispanic_moe
  )Look at the margins of error. A large share of the Other Races and Ethnicities have coefficients of variation greater than 0.4.
tracts %>%
summarize(mean((poverty_other_races_se / poverty_other_races) > 0.4), .by = year)# A tibble: 5 × 2
year `mean((poverty_other_races_se/poverty_other_races) > 0.4)`
<dbl> <dbl>
1 2014 0.897
2 2016 0.889
3 2018 0.897
4 2021 0.921
5 2023 0.895
Let’s look at the margins of error in relation to the counts of people in each race/ethnicity category in each county. Observations to the upper left of the black line have coefficients of variation in excess of 0.4.
tracts %>%
ggplot(aes(poverty_black, poverty_black_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(
title = "Most Black Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4"
) +
coord_equal() +
scatter_grid() +
facet_wrap(. ~ year)tracts %>%
ggplot(aes(poverty_hispanic, poverty_hispanic_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(
title = "Most Hispanic Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4"
) +
coord_equal() +
scatter_grid() +
facet_wrap(. ~ year)tracts %>%
ggplot(aes(poverty_other_races, poverty_other_races_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(
title = "Most Other Races and Ethnicities Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4"
) +
coord_equal() +
scatter_grid() +
facet_wrap(. ~ year)tracts %>%
ggplot(aes(poverty_white_nonhispanic, poverty_white_nonhispanic_se)) +
geom_point(alpha = 0.1, size = 0.5) +
geom_abline(aes(slope = 0.4, intercept = 0)) +
labs(
title = "Most White, non-Hispanic Estimates Have Large SEs",
subtitle = "Line represents a CV of 0.4"
) +
coord_equal() +
scatter_grid() +
facet_wrap(. ~ year)As mentioned earlier, these race/ethnicity groups are not disjoint. Accordingly, summing the groups will result in population counts that exceed the population. It will also result in poverty counts that are inflated.
tracts %>%
mutate(poverty_summed = poverty_black + poverty_hispanic + poverty_other_races + poverty_white_nonhispanic) %>%
ggplot(aes(poverty, poverty_summed)) +
geom_point(alpha = 0.2, size = 1) +
geom_abline() +
coord_equal() +
labs(title = "The Counts Are Unequal because the Groups Aren't Disjoint") +
scatter_grid() +
facet_wrap(. ~ year)3. Count the number of people in poverty who live in census tracts with poverty > 40% in each county.
We turn the count of people in poverty into a rate.
# Convert the poverty count into a tract-level poverty rate
tracts <- tracts %>%
  mutate(poverty_rate = poverty / people)
# Rates must be valid proportions (people > 0 was enforced earlier)
stopifnot(min(tracts$poverty_rate) >= 0)
stopifnot(max(tracts$poverty_rate) <= 1)We calculate the rate of poverty in high-poverty tracts. We will allocate the portions of the tracts to places in a later step; however, we assume the distribution of poverty is equal within each tract, so a high-poverty tract will be high poverty for counties and places.
tracts <- tracts %>%
  mutate(
    # In tracts over the 40% threshold, everyone in poverty counts as
    # exposed; elsewhere the exposed count is zero
    high_poverty = if_else(poverty_rate > 0.4, poverty, 0),
    high_poverty_black = if_else(poverty_rate > 0.4, poverty_black, 0),
    high_poverty_hispanic = if_else(poverty_rate > 0.4, poverty_hispanic, 0),
    high_poverty_other_races = if_else(poverty_rate > 0.4, poverty_other_races, 0),
    high_poverty_white_nonhispanic = if_else(poverty_rate > 0.4, poverty_white_nonhispanic, 0)
  )4. Crosswalk census tracts to census places
First we read in the tract-place crosswalk and join to our tract-level data to get tract-place pairs so we can aggregate up from tracts to places.
The original version of this metric calculated the share of people who are poor in a county who live in census tracts with poverty rates over 40%. Now we are calculating the share of people who are poor in a census place who live in census tracts with poverty rates over 40%. The county-level version of this metric was more straightforward because census tracts are completely contained within counties. The place-level version will be more difficult because places are only contained within states; they do not necessarily adhere to county or tract boundaries.
Census tract populations range from 1,200 - 8,000 with an average of 4,000 inhabitants. The smallest population in our list of places for 2021 is 74,793 (North Port city, FL), so all tracts are smaller than the places that we’re working with. However, “Tract 1” may be located in both “Place A” and “Place B” - therefore, we need to know what percentage of “Tract 1” area overlaps with the area of “Place A” and what percentage overlaps with the area of “Place B.” Then we can multiply the total population of “Tract 1” by those percentages to interpolate what share of that total population is located in “Place A” and what share is located in “Place B.” This is a technique known as areal interpolation.
First we need to know which census places have any overlap with each census tract. We construct a census tract to place crosswalk using the Missouri Census Data Center’s Geocorr 2022 tool. We construct the crosswalk using the following options:
- Input Options
- Select the state(s) (including DC and/or PR) to process:
- Select all states including DC but excluding PR
- Select one or more source geographies:
- 2010/2020 Geographies: census tract
- Select one or more target geographies:
- 2014/2020 Geographies: Place (city, town, village, CDP, etc.)
- Weighting variable:
- Population (2010/2020 census)
- Ignore census blocks with a value of 0 for the weighting variable: TRUE (select this option)
- Select the state(s) (including DC and/or PR) to process:
- Output options
- Generate second allocation factor
[AFACT2]showing portion of target geocodes in source geocodes
- Generate second allocation factor
- Geographic Filtering Options
- Combine geographic filters using:
- AND (intersection)
- Combine geographic filters using:
Then click “Run request” at the bottom of the screen. After the crosswalk finished processing I downloaded it, renamed it, and moved it to the geographic-crosswalks folder for this project.
Now we read in the tract to place crosswalk and clean it.
## this has updated geographies for the revised (in 2022) connecticut "counties"
## (maps each 2020-vintage tract FIPS to its 2022-vintage tract FIPS)
crosswalk_20_ct <- read_csv(here::here("geographic-crosswalks", "data", "tract_crosswalk_2022_connecticut.csv")) %>%
  janitor::clean_names() %>%
  select(
    tract_fips_2020,
    tract_fips_2022
  )
# 2020 tract-to-place crosswalk from geocorr; skip = 1 drops the second
# human-readable header row
crosswalk_20 <- read_csv(
  here::here("geographic-crosswalks", "data", "tract-place-crosswalk_2020.csv"),
  skip = 1
) %>%
  select(
    state = `State code`,
    county = `County code`,
    place = `Place code`,
    tract = Tract,
    afact = `tract-to-place allocation factor`,
    afact2 = `place-to-tract allocation factor`
  ) %>%
  mutate(
    # geocorr county codes include the state prefix; keep the 3-digit county part
    county = substring(county, 3, 5),
    # unique place key: state FIPS + place FIPS
    state_place = str_c(state, place),
    # geocorr writes tracts like "1234.01"; drop the dot to match ACS tract ids
    tract = str_remove(string = tract, pattern = "[.]")
  ) %>%
  # place GEOIDs of 99999 indicate tracts that are not located within a census place
  # NOTE(review): `place` appears to be character here; `!= 99999` relies on
  # implicit coercion to "99999" — confirm
  filter(place != 99999)
## adjusting for the revised 2022 CT counties
crosswalk_22 <- crosswalk_20 %>%
  # build the full state+county+tract FIPS to join the CT lookup on
  mutate(tract_fips_2020 = str_c(state, county, tract)) %>%
  left_join(crosswalk_20_ct, by = "tract_fips_2020") %>%
  mutate(
    county = case_when(
      # CT tracts get their revised 2022 county code; all others keep theirs
      !is.na(tract_fips_2022) ~ str_sub(tract_fips_2022, 3, 5),
      TRUE ~ county
    )
  ) %>%
  # drop the helper FIPS columns
  select(-matches("tract_fips"))
# 2010 tract to 2014 place crosswalk (geocorr 2018), used for the 2014-2018
# ACS vintages
# NOTE(review): col_types declares `placefp` but transmute() reads
# `placefp14` — confirm the intended column spec matches the file
crosswalk_10 <- read_csv(
  here::here("geographic-crosswalks", "data", "geocorr2018_2010tract_to_2014place.csv"),
  col_types = cols(
    state = col_character(),
    county = col_character(),
    placefp = col_character(),
    tract = col_character(),
    afact = col_double(),
    afact2 = col_double()
  )
) %>%
  transmute(
    # zero-pad codes so they match the 2020 crosswalk's formatting
    state = str_pad(state, side = "left", pad = "0", width = 2),
    county = str_pad(county, side = "left", pad = "0", width = 5) %>% str_sub(3, 5),
    place = str_pad(placefp14, side = "left", pad = "0", width = 5),
    tract = str_remove(string = tract, pattern = "[.]") %>% str_pad(side = "right", pad = "0", width = 6),
    state_place = str_c(state, place),
    afact,
    afact2
  ) %>%
  # place GEOIDs of 99999 indicate tracts that are not located within a
  # census place
  filter(place != 99999)Expand the crosswalks to include a column with the years in tracts the crosswalk can apply to
# Create a tibble for the years you want to expand
years_10 <- tibble(year = c(2014, 2016, 2018))
# Perform cross join to repeat the 2010-vintage crosswalk for each ACS year it serves
crosswalk_10 <- crosswalk_10 |>
  crossing(years_10)
# Stack the vintage-specific crosswalks, tagging each with the ACS year it applies to
crosswalk <- bind_rows(
  crosswalk_10,
  crosswalk_20 %>% mutate(year = 2021),
  crosswalk_22 %>% mutate(year = 2023)
) |>
  relocate(year, .before = everything())We are only interested in places with large populations. We load the crosswalk containing those places and filter to the places of interest.
# Load the list of large places used across this project and build the
# combined state+place key that uniquely identifies a census place
places_of_interest <-
  read_csv(here::here("geographic-crosswalks", "data", "place-populations.csv")) %>%
  mutate(state_place = paste0(state, place))

# Keep only crosswalk rows for those places (semi_join filters on the key
# without adding columns or duplicating rows)
crosswalk <- crosswalk %>%
  semi_join(places_of_interest, by = "state_place")
crosswalk <-
inner_join(
crosswalk,
select(places_of_interest, year, state_place, place_name),
by = c("year", "state_place")
  )The crosswalk contains an allocation factor variable, afact, which indicates the proportion of the source geographies (tracts) contained within the target geography (place). It also contains afact2, which is the proportion of the target geography (place) included in each source geography (tract).
We can use afact to allocate census tract data to places. The allocation is based on 2020 data and the ACS data uses 2021 data. We will use the product of afact and afact2 for a quality measure later.
crosswalk <- crosswalk %>%
mutate(afact_product = afact * afact2)Join data to match each census tract with every census place that the tract overlaps with.
tracts_joined <- left_join(tracts, crosswalk, by = c("year", "state", "county", "tract")) %>%
arrange(state_place)Many tracts are missing place because they do not overlap with any place of interest.
tracts_joined |>
filter(is.na(state_place)) |>
count(year)# A tibble: 5 × 2
year n
<dbl> <int>
1 2014 45335
2 2016 45330
3 2018 45317
4 2021 52885
5 2023 52884
tracts_joined <- tracts_joined %>%
filter(!is.na(state_place))5. Summarize the tract data to the place-level
We calculate overall poverty and the poverty counts for each race/ethnicity group, weight each tract by its allocation factor, and then sum the tract data to the place level.
# Aggregate tract estimates to places, weighting each tract by afact
# (the share of the tract's population inside the place)
places_summary <- tracts_joined %>%
  summarize(
    people = sum(people * afact),
    # effective number of whole tracts contributing to the place
    tracts = sum(afact),
    # poverty
    poverty = sum(poverty * afact),
    poverty_black = sum(poverty_black * afact),
    poverty_hispanic = sum(poverty_hispanic * afact),
    poverty_other_races = sum(poverty_other_races * afact),
    poverty_white_nonhispanic = sum(poverty_white_nonhispanic * afact),
    # high poverty
    high_poverty = sum(high_poverty * afact),
    high_poverty_black = sum(high_poverty_black * afact),
    high_poverty_hispanic = sum(high_poverty_hispanic * afact),
    high_poverty_other_races = sum(high_poverty_other_races * afact),
    high_poverty_white_nonhispanic = sum(high_poverty_white_nonhispanic * afact),
    # standard errors
    # NOTE(review): these weight the squared MOE by afact (afact * moe^2)
    # rather than squaring the weighted MOE ((afact * moe)^2), and are not
    # divided by 1.645 despite the "_se" name — confirm which approximation
    # is intended
    poverty_se = sqrt(sum(afact * (poverty_moe^2))),
    poverty_black_se = sqrt(sum(afact * (poverty_black_moe^2))),
    poverty_hispanic_se = sqrt(sum(afact * (poverty_hispanic_moe^2))),
    poverty_other_races_se = sqrt(sum(afact * (poverty_other_races_moe^2))),
    poverty_white_nonhispanic_se = sqrt(sum(afact * (poverty_white_nonhispanic_moe^2))),
    afact_sum_product = sum(afact_product),
    .by = c(state, place, state_place, place_name, year)
  )
places_summary <- places_summary %>%
  mutate(poverty_rate = poverty / people)There are 485 observations through 2014-2018 and 486 after 2021
places_summary |>
count(year)# A tibble: 5 × 2
year n
<dbl> <int>
1 2014 485
2 2016 485
3 2018 485
4 2021 486
5 2023 486
8[1] 8
We pull in the place-level data and compare it to the calculated place-level data. The poverty rates should be identical; however, they may differ from numbers published elsewhere (like here) that use Small-Area Income and Poverty Estimates (SAIPE).
# Pull reported place-level estimates straight from the API (cached, as with
# the tract pull above)
if (!file.exists(here::here("06_neighborhoods/poverty-exposure/data/places.csv"))) {
  places_test <-
    map(years, ~ get_poverty_status_data(.x, "place")) |>
    list_rbind() |>
    relocate(year, .before = everything())
  write_csv(places_test, here("06_neighborhoods/poverty-exposure/data/places.csv"))
}places_test <- read_csv(here::here("06_neighborhoods/poverty-exposure/data/places.csv"))
# Restrict to places of interest and compute the reported poverty rate for
# comparison against the interpolated estimates
places_test <- places_test %>%
  mutate(state_place = paste0(state, place)) |>
  filter(state_place %in% places_of_interest$state_place) |>
  mutate(poverty_rate = poverty / people) %>%
  # select(year, state, place, state_place, poverty, poverty_rate, people) %>%
  arrange(year, state, place, state_place)# join data
# Join the interpolated place estimates to the directly reported place
# estimates for validation. `year` must be part of the key: both tables hold
# five ACS vintages per place, so joining on place alone fans out
# many-to-many and silently pairs estimates from different years.
test_joined <- inner_join(
  places_summary,
  places_test,
  by = c("year", "state", "place", "state_place"),
  suffix = c("_interpolated", "_reported")
)
test_joined %>%
ggplot(aes(people_reported, people_interpolated)) +
geom_abline() +
geom_point(alpha = 0.2) +
coord_equal() +
scatter_grid() +
labs(title = "Reported population and interpolated population are similar")test_joined %>%
ggplot(aes(poverty_reported, poverty_interpolated)) +
geom_abline() +
geom_point(alpha = 0.2) +
coord_equal() +
scatter_grid() +
labs(title = "Reported poverty and interpolated poverty are similar")test_joined %>%
ggplot(aes(poverty_rate_reported, poverty_rate_interpolated)) +
geom_abline() +
geom_point(alpha = 0.2) +
coord_equal() +
scatter_grid() +
labs(title = "Reported poverty rate and interpolated poverty rate are similar")bind_rows(
reported = places_test,
interpolated = places_summary,
.id = "source"
) %>%
select(
year,
state_place,
source,
poverty_black,
poverty_white_nonhispanic,
poverty_hispanic
) %>%
pivot_longer(-c(year, state_place, source), names_to = "var", values_to = "value") %>%
pivot_wider(names_from = "source", values_from = "value") %>%
ggplot(aes(reported, interpolated, color = var)) +
geom_abline() +
geom_point(alpha = 0.1) +
facet_wrap(~var) +
coord_equal() +
scatter_grid() +
labs(
title = "Reported poverty and interpolated poverty are similar",
subtitle = "By race/ethncity subgroup"
)bind_rows(
reported = places_test,
interpolated = places_summary,
.id = "source"
) %>%
filter(people < 200000) %>%
select(
year,
state_place,
source,
poverty_black,
poverty_white_nonhispanic,
poverty_hispanic
) %>%
pivot_longer(-c(year, state_place, source), names_to = "var", values_to = "value") %>%
pivot_wider(names_from = "source", values_from = "value") %>%
ggplot(aes(reported, interpolated, color = var)) +
geom_abline() +
geom_point(alpha = 0.1) +
facet_wrap(~var) +
coord_equal() +
scatter_grid() +
labs(
title = "Reported poverty and interpolated poverty are similar",
subtitle = "By race/ethncity subgroup for places with fewer than 200,000 people"
)6. Divide high poverty by total poverty
We need the conditional logic to deal with division by zero. If there is no poverty then poverty exposure is zero.
# Poverty exposure = share of a place's people in poverty who live in
# high-poverty (>40%) tracts. Subgroup ratios are guarded against 0/0:
# with no one in poverty, exposure is defined as zero.
# NOTE(review): the overall ratio has no zero-poverty guard; the stopifnot
# below would flag NaN if a place ever had zero total poverty — confirm
places_summary <- places_summary %>%
  mutate(
    share_poverty_exposure = high_poverty / poverty,
    share_poverty_exposure_black =
      if_else(condition = poverty_black > 0,
        true = high_poverty_black / poverty_black,
        false = 0
      ),
    share_poverty_exposure_hispanic =
      if_else(condition = poverty_hispanic > 0,
        true = high_poverty_hispanic / poverty_hispanic,
        false = 0
      ),
    share_poverty_exposure_other_races =
      if_else(condition = poverty_other_races > 0,
        true = high_poverty_other_races / poverty_other_races,
        false = 0
      ),
    share_poverty_exposure_white_nonhispanic =
      if_else(condition = poverty_white_nonhispanic > 0,
        true = high_poverty_white_nonhispanic / poverty_white_nonhispanic,
        false = 0
      ),
  )
# This checks whether there are any missing values for any of the variables in places_summary
stopifnot(
  all(map_dbl(places_summary, ~ sum(is.na(.x))) == 0)
)Overall
Interestingly, college towns dominate the list.
places_summary %>%
arrange(desc(share_poverty_exposure)) %>%
select(state_place, place_name, share_poverty_exposure, poverty_rate)# A tibble: 2,427 × 4
state_place place_name share_poverty_exposure poverty_rate
<chr> <chr> <dbl> <dbl>
1 4263624 Reading city 0.801 0.401
2 4815976 College Station city 0.750 0.346
3 2621000 Dearborn city 0.747 0.286
4 4815976 College Station city 0.737 0.320
5 4815976 College Station city 0.682 0.279
6 1225175 Gainesville city 0.680 0.362
7 3673000 Syracuse city 0.670 0.351
8 2621000 Dearborn city 0.660 0.291
9 4263624 Reading city 0.660 0.393
10 2621000 Dearborn city 0.649 0.283
# ℹ 2,417 more rows
Black
places_summary %>%
arrange(desc(share_poverty_exposure_black)) %>%
select(state_place, place_name, share_poverty_exposure_black, poverty_rate, poverty_black)# A tibble: 2,427 × 5
state_place place_name share_poverty_exposu…¹ poverty_rate poverty_black
<chr> <chr> <dbl> <dbl> <dbl>
1 4848768 Mission city 1 0.265 1
2 4857200 Pharr city 1 0.285 7
3 4857200 Pharr city 1 0.326 68.0
4 4815976 College Statio… 0.926 0.346 2489.
5 4845384 McAllen city 0.922 0.215 192.
6 4263624 Reading city 0.837 0.401 3960
7 2717000 Duluth city 0.826 0.227 1384.
8 0832155 Greeley city 0.824 0.206 1050.
9 4815976 College Statio… 0.794 0.320 2398.
10 0807850 Boulder city 0.744 0.219 339.
# ℹ 2,417 more rows
# ℹ abbreviated name: ¹share_poverty_exposure_black
Hispanic
places_summary %>%
arrange(desc(share_poverty_exposure_hispanic)) %>%
select(state_place, place_name, share_poverty_exposure_hispanic, poverty_rate, poverty_hispanic)# A tibble: 2,427 × 5
state_place place_name share_poverty_exposu…¹ poverty_rate poverty_hispanic
<chr> <chr> <dbl> <dbl> <dbl>
1 4263624 Reading city 0.813 0.401 24911
2 3673000 Syracuse ci… 0.781 0.351 6039
3 2629000 Flint city 0.767 0.419 1919
4 2629000 Flint city 0.740 0.404 1749
5 1225175 Gainesville… 0.735 0.350 4950.
6 0103076 Auburn city 0.731 0.263 560.
7 4815976 College Sta… 0.728 0.346 4619.
8 4815976 College Sta… 0.722 0.279 5970.
9 1225175 Gainesville… 0.715 0.362 5181.
10 4815976 College Sta… 0.714 0.320 4471.
# ℹ 2,417 more rows
# ℹ abbreviated name: ¹share_poverty_exposure_hispanic
Other Races
places_summary %>%
arrange(desc(share_poverty_exposure_other_races)) %>%
select(state_place, place_name, share_poverty_exposure_other_races, poverty_rate, poverty_other_races)# A tibble: 2,427 × 5
state_place place_name share_poverty_exposure_other_…¹ poverty_rate
<chr> <chr> <dbl> <dbl>
1 4263624 Reading city 0.821 0.401
2 4815976 College Station city 0.811 0.346
3 2621000 Dearborn city 0.810 0.286
4 4815976 College Station city 0.756 0.279
5 1270600 Tallahassee city 0.752 0.262
6 3673000 Syracuse city 0.745 0.351
7 2629000 Flint city 0.727 0.419
8 4815976 College Station city 0.726 0.320
9 1712385 Champaign city 0.708 0.235
10 1712385 Champaign city 0.707 0.232
# ℹ 2,417 more rows
# ℹ abbreviated name: ¹share_poverty_exposure_other_races
# ℹ 1 more variable: poverty_other_races <dbl>
White, Non-Hispanic
places_summary %>%
arrange(desc(share_poverty_exposure_white_nonhispanic)) %>%
select(state_place, place, place_name, share_poverty_exposure_white_nonhispanic, poverty_rate, poverty_white_nonhispanic)# A tibble: 2,427 × 6
state_place place place_name share_poverty_exposu…¹ poverty_rate
<chr> <chr> <chr> <dbl> <dbl>
1 2622000 22000 Detroit city 0.770 0.394
2 1225175 25175 Gainesville city 0.754 0.362
3 2621000 21000 Dearborn city 0.751 0.286
4 4815976 15976 College Station city 0.737 0.320
5 2622000 22000 Detroit city 0.726 0.398
6 4815976 15976 College Station city 0.719 0.346
7 4263624 63624 Reading city 0.715 0.401
8 1225175 25175 Gainesville city 0.704 0.350
9 1303440 03440 Athens-Clarke County u… 0.675 0.359
10 2621000 21000 Dearborn city 0.670 0.291
# ℹ 2,417 more rows
# ℹ abbreviated name: ¹share_poverty_exposure_white_nonhispanic
# ℹ 1 more variable: poverty_white_nonhispanic <dbl>
There shouldn’t be any missing values.
stopifnot(
places_summary %>%
filter(is.na(share_poverty_exposure)) %>%
nrow() == 0
)7. Validation
“All” file
The table shows the calculated metrics. Click on the variable columns to sort the table.
places_summary %>%
ggplot(aes(share_poverty_exposure)) +
geom_histogram() +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
labs(
title = "Most Places in 2021 Have No Poverty Exposure",
subtitle = "The Distribution of Poverty Exposure"
) +
facet_wrap(. ~ year)places_summary %>%
ggplot(aes(tracts, share_poverty_exposure)) +
geom_point(
alpha = 0.2,
size = 1
) +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
scatter_grid() +
labs(
title = "Most Extreme Poverty Exposure Values are for Small Places",
x = "Number of Tracts in Place"
) +
facet_wrap(. ~ year)places_summary %>%
ggplot(aes(people, share_poverty_exposure)) +
geom_point(
alpha = 0.2,
size = 1
) +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
scatter_grid() +
labs(
title = "Most Extreme Poverty Exposure Values are for Small Counties",
x = "Population in Place"
) +
facet_wrap(. ~ year)places_summary %>%
ggplot(aes(poverty_rate, share_poverty_exposure)) +
geom_point(
alpha = 0.2,
size = 1
) +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
scatter_grid() +
labs(title = "Place Poverty Rate and Place Poverty Exposure Are Related") +
facet_wrap(. ~ year)Subgroups File
places_summary_subgroups_plots <- places_summary %>%
select(year, state, place, contains("exposure")) %>%
# pivot to very long
pivot_longer(c(-year, -state, -place), names_to = "subgroup", values_to = "share_poverty_exposure") %>%
# clean up names
mutate(
subgroup =
recode(
subgroup,
share_poverty_exposure = "All",
share_poverty_exposure_black = "Black",
share_poverty_exposure_hispanic = "Hispanic",
share_poverty_exposure_other_races = "Other Races and Ethnicities",
share_poverty_exposure_white_nonhispanic = "White, Non-Hispanic"
)
)
places_summary_subgroups_plots %>%
filter(subgroup != "All") %>%
ggplot(aes(share_poverty_exposure)) +
geom_histogram() +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
facet_wrap(~subgroup) +
labs(
title = "Most Places in 2021 Have No Poverty Exposure",
subtitle = "The Distribution of Poverty Exposure"
) +
facet_wrap(. ~ year)places_summary_subgroups_plots <- left_join(places_summary_subgroups_plots, select(places_summary, -share_poverty_exposure), by = c("year", "state", "place"))
places_summary_subgroups_plots %>%
filter(subgroup != "All") %>%
ggplot(aes(tracts, share_poverty_exposure)) +
geom_point(
alpha = 0.2,
size = 1
) +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
facet_wrap(~subgroup, nrow = 2) +
scatter_grid() +
labs(
title = "Most Extreme Poverty Exposure Values are for Small Places",
x = "Number of Tracts in Place"
) +
facet_wrap(. ~ year)places_summary_subgroups_plots %>%
filter(subgroup != "All") %>%
ggplot(aes(people, share_poverty_exposure)) +
geom_point(
alpha = 0.2,
size = 1
) +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
facet_wrap(~subgroup) +
scatter_grid() +
labs(
title = "Most Extreme Poverty Exposure Values are for Small Places",
x = "Population in Place"
) +
facet_wrap(. ~ year)places_summary_subgroups_plots %>%
filter(subgroup != "All") %>%
ggplot(aes(poverty_rate, share_poverty_exposure)) +
geom_point(
alpha = 0.2,
size = 1
) +
scale_y_continuous(
limits = c(0, NA),
expand = expansion(mult = c(0, 0.2))
) +
facet_wrap(~subgroup) +
scatter_grid() +
labs(title = "Place Poverty Rate and Place Poverty Exposure Are Related") +
facet_wrap(. ~ year)rm(places_summary_subgroups_plots)8. Quality Flags
We consider three dimensions of quality when developing the quality variables for poverty exposure.
- The unweighted number of observations behind each calculation.
- The coefficient of variation for poverty in the census place.
- The overlap of census place (target geography) and the census tracts (source geographies).
1. Unweighted number of observations
We suppress any estimates with thirty or fewer unweighted observations.
#' Suppress unreliable place-level exposure estimates
#'
#' @param race The count of people in poverty in a race/ethnicity group
#' @param exposure The exposure index to (possibly) suppress
#' @param threshold The minimum group size required to report the exposure
#'   index; values where `race <= threshold` are suppressed
#'
#' @return `exposure`, with values replaced by `NA_real_` wherever the group
#'   count is at or below `threshold`
#'
suppress_place <- function(race, exposure, threshold) {
  # NA_real_ satisfies if_else()'s strict type check (exposure is double)
  if_else(race <= threshold, NA_real_, exposure)
}
places_summary %>%
summarize(
all = sum(is.na(share_poverty_exposure)),
black_nh = sum(is.na(share_poverty_exposure_black)),
hispanic = sum(is.na(share_poverty_exposure_hispanic)),
other_nh = sum(is.na(share_poverty_exposure_other_races)),
white_nh = sum(is.na(share_poverty_exposure_white_nonhispanic))
)# A tibble: 1 × 5
all black_nh hispanic other_nh white_nh
<int> <int> <int> <int> <int>
1 0 0 0 0 0
# Apply the n <= 30 suppression rule to the overall estimate and to each
# race/ethnicity subgroup estimate. One call per output column; every call
# uses the same threshold.
places_summary <- places_summary %>%
  mutate(
    share_poverty_exposure = suppress_place(race = poverty, exposure = share_poverty_exposure, threshold = 30),
    share_poverty_exposure_black = suppress_place(race = poverty_black, exposure = share_poverty_exposure_black, threshold = 30),
    share_poverty_exposure_hispanic = suppress_place(race = poverty_hispanic, exposure = share_poverty_exposure_hispanic, threshold = 30),
    share_poverty_exposure_other_races = suppress_place(race = poverty_other_races, exposure = share_poverty_exposure_other_races, threshold = 30),
    share_poverty_exposure_white_nonhispanic = suppress_place(race = poverty_white_nonhispanic, exposure = share_poverty_exposure_white_nonhispanic, threshold = 30)
  )
# After suppression: count how many estimates were set to NA in each group.
places_summary %>%
summarize(
all = sum(is.na(share_poverty_exposure)),
black_nh = sum(is.na(share_poverty_exposure_black)),
hispanic = sum(is.na(share_poverty_exposure_hispanic)),
other_nh = sum(is.na(share_poverty_exposure_other_races)),
white_nh = sum(is.na(share_poverty_exposure_white_nonhispanic))
)# A tibble: 1 × 5
all black_nh hispanic other_nh white_nh
<int> <int> <int> <int> <int>
1 0 23 1 0 0
2. Coefficient of variation
The coefficient of variation is a standard measure of precision normalized by the magnitude of an estimate. In this case it is \(\frac{SE(\hat{count})}{\hat{count}}\). We calculate the coefficient of variation for each poverty estimate.
We don’t calculate the CV at the tract-level or for high poverty.
# Coefficient of variation = standard error / estimate for each poverty count.
# Large CVs flag imprecise estimates and drive the quality flags set below.
places_summary <- places_summary %>%
mutate(
poverty_cv = poverty_se / poverty,
poverty_black_cv = poverty_black_se / poverty_black,
poverty_hispanic_cv = poverty_hispanic_se / poverty_hispanic,
poverty_other_races_cv = poverty_other_races_se / poverty_other_races,
poverty_white_nonhispanic_cv = poverty_white_nonhispanic_se / poverty_white_nonhispanic
)
# Inspect places with imprecise estimates (CV >= 0.4). Coloring by the
# n <= 30 rule shows that the worst CVs belong to places the suppression
# step already drops.
# NOTE(fix): the subtitles previously said "CV > 0.4" while the filters use
# >= 0.4; the subtitles now match the code. Also fixed the "non_hispanic"
# typo in the last title.
places_summary %>%
  filter(poverty_cv >= 0.4) %>%
  ggplot(aes(poverty, poverty_cv, color = poverty <= 30)) +
  geom_point(alpha = 0.2) +
  labs(
    title = "The Worst CVs Will be Dropped for n <= 30",
    subtitle = "Places with CV >= 0.4, poverty <= 30 in yellow"
  ) +
  scatter_grid()

places_summary %>%
  filter(poverty_black_cv >= 0.4) %>%
  ggplot(aes(poverty_black, poverty_black_cv, color = poverty_black <= 30)) +
  geom_point(alpha = 0.2) +
  labs(
    title = "Black: The Worst CVs Will be Dropped for n <= 30",
    subtitle = "Places with CV >= 0.4, poverty_black <= 30 in yellow"
  ) +
  scatter_grid()

places_summary %>%
  filter(poverty_hispanic_cv >= 0.4) %>%
  ggplot(aes(poverty_hispanic, poverty_hispanic_cv, color = poverty_hispanic <= 30)) +
  geom_point(alpha = 0.2) +
  labs(
    title = "Hispanic: The Worst CVs Will be Dropped for n <= 30",
    subtitle = "Places with CV >= 0.4, poverty_hispanic <= 30 in yellow"
  ) +
  scatter_grid()

places_summary %>%
  filter(poverty_other_races_cv >= 0.4) %>%
  ggplot(aes(poverty_other_races, poverty_other_races_cv, color = poverty_other_races <= 30)) +
  geom_point(alpha = 0.2) +
  labs(
    title = "Other Races and Ethnicities: The Worst CVs Will be Dropped for n <= 30",
    subtitle = "Places with CV >= 0.4, poverty_other_races <= 30 in yellow"
  ) +
  scatter_grid()

places_summary %>%
  filter(poverty_white_nonhispanic_cv >= 0.4) %>%
  ggplot(aes(poverty_white_nonhispanic, poverty_white_nonhispanic_cv, color = poverty_white_nonhispanic <= 30)) +
  geom_point(alpha = 0.2) +
  labs(
    title = "White, Non-Hispanic: The Worst CVs Will be Dropped for n <= 30",
    subtitle = "Places with CV >= 0.4, white_nh <= 30 in yellow"
  ) +
  scatter_grid()

3. Overlap between census tracts and census places
Areal interpolation reduces the precision of our estimates. The visualizations above demonstrate that there is a tight connection between our interpolated estimates and the estimates reported directly at the census place level.
We still develop a measure of the amount of data shared by the target geography and source geographies. We use an approach developed by Greg Acs and Kevin Werner for other spatial interpolations. The idea is to weight the proportion of tract data in a census place by the proportion of the census place in the tract. Consider a few examples:
- If
afactandafact2are both 1, then the census tract and census place share the same borders. - If
afactis < 1 andafact2is 1, the census tract spans the place but the place is entirely in the tract. This is impossible. - If
afactis 1 andafact2is < 1, then the census place is spread over multiple tracts.afactandafact2are multiplied together and summed for each instance of the county. So if the place is spread perfectly among two tracts,afact2will be 0.5 for each row, the product ofafactandafact2will be 0.5, and the sum will 1 one, meaning we know where 100% of the places’s data comes from. - If both
afactandafact2are < 1, then the result is a combination of previous two examples. There will be multiple instances of rows to be summed, but the total sum will likely be less than 1.
We performed these calculations above.
All proportions exceed 0.75. This indicates that there is a tight connection between the census tracts and the census places. This is unsurprising since we only focus on census places with large populations.
summary(places_summary$afact_sum_product) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.7524 0.9262 0.9669 0.9534 0.9927 1.0008
4. Data Quality
We need to add data quality flags with 1, 2, or 3. The overlap between census tracts and census places is high in all cases. Therefore we will only suppress values based on sample size and downgrade observations based on CVs. The values are outlined in the data standards.
1 - If the place coefficient of variation for the count in the group is less than 0.2
2 - If the place coefficient of variation for the count in the group is less than 0.4
3 - If the place coefficient of variation for the count in the group is 0.4 or greater
NA - If the metric is missing
#' Assign a data quality flag
#'
#' @param cv The coefficient of variation for the count in a group within a place
#' @param exposure A race/ethnicity exposure metric
#'
#' @return A numeric data quality flag: 1 (cv < 0.2), 2 (0.2 <= cv < 0.4),
#'   3 (cv >= 0.4), or NA when the metric itself is missing
#'
set_quality <- function(cv, exposure) {
  # case_when evaluates in order, so the 2 branch only sees 0.2 <= cv < 0.4
  quality <- case_when(
    cv < 0.2 ~ 1,
    cv < 0.4 ~ 2,
    cv >= 0.4 ~ 3
  )
  # A flag is only meaningful when the metric is reported; NA_real_ keeps the
  # flag column double without an as.numeric(NA) coercion
  quality <- if_else(is.na(exposure), NA_real_, quality)
  return(quality)
}
# Attach a data quality flag to the overall metric and to each subgroup
# metric, pairing every exposure column with its matching CV column.
places_summary <- places_summary %>%
  mutate(
    share_poverty_exposure_quality = set_quality(cv = poverty_cv, exposure = share_poverty_exposure),
    share_poverty_exposure_black_quality = set_quality(cv = poverty_black_cv, exposure = share_poverty_exposure_black),
    share_poverty_exposure_hispanic_quality = set_quality(cv = poverty_hispanic_cv, exposure = share_poverty_exposure_hispanic),
    share_poverty_exposure_other_races_quality = set_quality(cv = poverty_other_races_cv, exposure = share_poverty_exposure_other_races),
    share_poverty_exposure_white_nonhispanic_quality = set_quality(cv = poverty_white_nonhispanic_cv, exposure = share_poverty_exposure_white_nonhispanic)
  )
# Tabulate the quality flags for the overall metric and for each subgroup.
# NA rows correspond to metrics suppressed by the n <= 30 rule above.
count(places_summary, share_poverty_exposure_quality)# A tibble: 2 × 2
share_poverty_exposure_quality n
<dbl> <int>
1 1 2316
2 2 111
count(places_summary, share_poverty_exposure_black_quality)# A tibble: 4 × 2
share_poverty_exposure_black_quality n
<dbl> <int>
1 1 840
2 2 725
3 3 839
4 NA 23
count(places_summary, share_poverty_exposure_hispanic_quality)# A tibble: 4 × 2
share_poverty_exposure_hispanic_quality n
<dbl> <int>
1 1 1068
2 2 997
3 3 361
4 NA 1
count(places_summary, share_poverty_exposure_other_races_quality)# A tibble: 3 × 2
share_poverty_exposure_other_races_quality n
<dbl> <int>
1 1 1066
2 2 1121
3 3 240
count(places_summary, share_poverty_exposure_white_nonhispanic_quality)# A tibble: 3 × 2
share_poverty_exposure_white_nonhispanic_quality n
<dbl> <int>
1 1 1802
2 2 575
3 3 50
Most of the places with missing values are very small.
# Collect places where any estimate (overall or any subgroup) was suppressed.
missing <- places_summary %>%
filter(
is.na(share_poverty_exposure) |
is.na(share_poverty_exposure_black) |
is.na(share_poverty_exposure_hispanic) |
is.na(share_poverty_exposure_other_races) |
is.na(share_poverty_exposure_white_nonhispanic)
)
# Largest population and tract counts among places with any suppressed value.
# NOTE(review): values are non-integer, so `people`/`tracts` appear to be
# interpolated estimates — confirm against the crosswalk step above.
max(missing$people)[1] 110694.2
max(missing$tracts)[1] 25.6416
9. Save the data
All File
We need to include all places in the published data even if we don’t have a metric for the place. We load the place file and join our metrics to the place file.
# Overall file: one row per place-year with the headline metric and its
# quality flag.
# NOTE(fix): the pipeline previously mixed %>% and |>; unified to %>%, the
# dominant pipe in this file.
final_data <- places_summary %>%
  select(
    year,
    state,
    place,
    share_poverty_exposure,
    share_poverty_exposure_quality
  ) %>%
  arrange(year, state, place)

write_csv(
  final_data,
  here::here("06_neighborhoods", "poverty-exposure", "final", "poverty-exposure_city.csv")
)

Subgroup File
# create a long version of the subgroup data
places_summary_subgroups <- places_summary %>%
select(state, place, year, contains("exposure")) %>%
# pivot to very long
pivot_longer(c(-state, -place, -year), names_to = "subgroup", values_to = "share_poverty_exposure") %>%
# create new variable names: *_quality columns become the quality variable,
# everything else is the estimate itself
mutate(variable = if_else(str_detect(subgroup, "_quality"),
"share_poverty_exposure_quality",
"share_poverty_exposure"
)) %>%
# drop the _quality suffix so the estimate and its flag share one subgroup key
mutate(subgroup = str_replace(subgroup, "_quality", "")) %>%
# pivot to long: one row per place-year-subgroup with estimate + quality
pivot_wider(names_from = variable, values_from = share_poverty_exposure) %>%
# clean up names
mutate(
subgroup =
recode(
subgroup,
share_poverty_exposure = "All",
share_poverty_exposure_black = "Black",
share_poverty_exposure_hispanic = "Hispanic",
share_poverty_exposure_other_races = "Other Races and Ethnicities",
share_poverty_exposure_white_nonhispanic = "White, Non-Hispanic"
)
)
# check the bounds of the poverty exposure metric (an exposure share must lie in [0, 1])
stopifnot(min(places_summary_subgroups$share_poverty_exposure, na.rm = TRUE) >= 0)
stopifnot(max(places_summary_subgroups$share_poverty_exposure, na.rm = TRUE) <= 1)places_summary_subgroups <- places_summary_subgroups %>%
mutate(subgroup_type = if_else(subgroup == "All", "all", "race-ethnicity"))
# create a frame with all possible place x subgroup rows so every place
# appears in the published file even when its metric is suppressed
# NOTE(fix): replaced the stray |> with %>% for a consistent pipe style.
all_places_subgroups <-
  expand_grid(
    places_of_interest %>% filter(year %in% years),
    subgroup = c("All", "Black", "Hispanic", "Other Races and Ethnicities", "White, Non-Hispanic")
  ) %>%
  mutate(subgroup_type = if_else(subgroup == "All", "all", "race-ethnicity"))

# left join keeps every combination from the full frame; unmatched rows get
# NA for the metric and flag
final_data_race_ethnicity <- left_join(
  all_places_subgroups,
  places_summary_subgroups,
  by = c("year", "state", "place", "subgroup_type", "subgroup")
) %>%
  select(
    year,
    state,
    place,
    subgroup_type,
    subgroup,
    share_poverty_exposure,
    share_poverty_exposure_quality
  ) %>%
  arrange(year, state, place)

write_csv(
  final_data_race_ethnicity,
  here::here("06_neighborhoods", "poverty-exposure", "final", "poverty-exposure_race-ethnicity_city.csv")
)

Testing
# Run the repository's standard final-data checks on both output files.
source(here::here("functions/testing/evaluate_final_data.R"))
final_city_data <- read_csv(here::here("06_neighborhoods", "poverty-exposure", "final", "poverty-exposure_city.csv"), show_col_types = FALSE)
# Overall file: no subgroups, no confidence intervals expected.
evaluate_final_data(
exp_form_path = "10a_final-evaluation/evaluation_form_exposure_poverty_overall_city.csv",
data = final_city_data, geography = "place",
subgroups = FALSE, confidence_intervals = FALSE
)[1] "This data passes all tests!"
final_city_data_subgroups <- read_csv(here::here("06_neighborhoods/poverty-exposure/final/poverty-exposure_race-ethnicity_city.csv"), show_col_types = FALSE)
# Subgroup file: same checks with subgroups enabled.
evaluate_final_data(
exp_form_path = "10a_final-evaluation/evaluation_form_exposure_poverty_race_eth_city.csv",
data = final_city_data_subgroups, geography = "place",
subgroups = TRUE, confidence_intervals = FALSE
)[1] "This data passes all tests!"